#importing libraries
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
# Creating a backup copy of the raw data.
loan_data_backup = pd.read_csv('/Users/vladimirant/Desktop/Walmart Data Science Interview/Data Science HW design val.csv')
# Working copy of the training data (keeps the backup untouched).
loan_data_train = loan_data_backup.copy()
loan_data_train.head()
loan_data_train.tail()
# Inspect dtypes of the variables.
# All the variables are float or integer format, which is OK.
loan_data_train.info()
loan_data_train.describe()
# Checking for missing values.
pd.options.display.max_rows = None  # display all rows of the output
loan_data_train.isnull().sum()
# No missing values.
# Check the number (and list) of unique values of each variable.
for col in loan_data_train.columns:
    # BUG FIX: this print statement was not indented under the for-loop,
    # which raises an IndentationError.
    print(col, len(loan_data_train[col].unique()), loan_data_train[col].unique())
loan_data_train["A24"].unique()
loan_data_train["A22"].unique()
# Plotting heatmap / correlation table.
import seaborn as sb
plt.subplots(figsize=(30, 30))
corr = loan_data_train.corr()
sb.heatmap(corr, annot=True)
# Checking the correlation table for highly correlated variables.
corr > 0.75
# https://www.projectpro.io/recipes/drop-out-highly-correlated-features-in-python
cor_matrix = loan_data_train.corr().abs()
print(cor_matrix)
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool works identically here.
upper_tri = cor_matrix.where(np.triu(np.ones(cor_matrix.shape), k=1).astype(bool))
print(upper_tri)
# Columns correlated above 0.75 with an earlier column are slated for removal.
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.75)]
print()
print(to_drop)
# These highly correlated features need to be deleted.
# Analysing the dependent variable: its distribution on the training set.
sns.set(font_scale=3)  # increase font size of all elements on the graph
plt.subplots(figsize=(20, 20))
sns.countplot(x='default', data=loan_data_train)
# The dataset is unbalanced.
# Share of default loans in the training dataset.
# IMPROVED: use mean() instead of dividing by the hard-coded row count
# 11500, so the expression stays correct if the dataset size changes.
loan_data_train['default'].mean()
# Reduced training set (highly correlated variables excluded).
loan_data_train_reduced = loan_data_train.drop(columns=to_drop)
loan_data_train_reduced.tail()
# Now repeat the same analysis for the test set.
# NOTE(review): this reads the SAME file as the training set — presumably
# a copy-paste leftover; confirm the intended test-set path. It also
# explains why later metrics on the "test" set mirror the train set.
loan_data_test = pd.read_csv('/Users/vladimirant/Desktop/Walmart Data Science Interview/Data Science HW design val.csv')
loan_data_test.tail()
# Checking for missing values.
pd.options.display.max_rows = None  # display all rows of the output
loan_data_test.isnull().sum()
# Distribution of the target variable in the test set.
sns.set(font_scale=3)  # increase font size of all elements
plt.subplots(figsize=(20, 20))
sns.countplot(x='default', data=loan_data_test)
# Same proportion of defaults as in the training set.
# IMPROVED: mean() instead of the hard-coded row count 11500.
loan_data_test['default'].mean()
# Reduced test set: drop the same correlated columns as for training.
loan_data_test_reduced = loan_data_test.drop(columns=to_drop)
# Creating inputs for future models.
FEATURES = ['A1', 'A3', 'A4', 'A5', 'A7', 'A8', 'A9', 'A13', 'A14', 'A15', 'A16',
            'A17', 'A20', 'A21', 'A22', 'A23', 'A25', 'A26', 'A27', 'A28', 'A29',
            'A30']
# BUG FIX: Y_train was taken from loan_data_test_reduced — the training
# target must come from the TRAINING set.
X_train, Y_train = loan_data_train_reduced[FEATURES], loan_data_train_reduced['default']
len(FEATURES)
# Creating X and Y for the (reduced) test set.
X_test, Y_test = loan_data_test_reduced[FEATURES], loan_data_test_reduced['default']
# Fitting a statsmodels logit on the training set.
import statsmodels.api as sm
logit_model = sm.Logit(Y_train, X_train)
result = logit_model.fit()
print(result.summary2())
# Exclude A4, A13, A15, A21, A23, A25, A26, A27, A29, A30
# because of high p-values (> 0.05).
# BUG FIX: Y_train was (re)assigned from loan_data_test_reduced below —
# the training target must come from the TRAINING set.
X_train_2, Y_train = loan_data_train_reduced[['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
    'A17', 'A20', 'A28']], loan_data_train_reduced['default']
# Refit the model with the reduced x-vector.
logit_model_2 = sm.Logit(Y_train, X_train_2)
result = logit_model_2.fit()
print(result.summary2())
# Excluding A28 as well due to its high p-value.
X_train_3, Y_train = loan_data_train_reduced[['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
    'A17', 'A20']], loan_data_train_reduced['default']
# Now all the remaining coefficients are significant.
logit_model_3 = sm.Logit(Y_train, X_train_3)
result = logit_model_3.fit()
print(result.summary2())
# Building the "final" logistic regression and fitting it on the last
# training feature set.
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
logreg = LogisticRegression()
logreg.fit(X_train_3, Y_train)
# Predict on the training set and report accuracy.
y_pred = logreg.predict(X_train_3)
# BUG FIX: the message said "test set" but the score is computed on the
# TRAIN set.
print('Accuracy of logistic regression classifier on train set: {:.2f}'.format(logreg.score(X_train_3, Y_train)))
# Confusion matrix on the training set.
from sklearn.metrics import confusion_matrix
# BUG FIX: the result was assigned to the name `confusion_matrix`,
# shadowing the imported function; use a distinct variable name.
cm_train = confusion_matrix(Y_train, y_pred)
print(cm_train)
# Classification report for the training set.
from sklearn.metrics import classification_report
print(classification_report(Y_train, y_pred))
# The model built on the original (unbalanced) dataset has trouble
# predicting class 1: all its metrics are lower than for class 0.
# ROC curve and AUC on the TRAIN set for the logistic regression model.
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
train_scores = logreg.predict_proba(X_train_3)[:, 1]
logit_roc_auc = roc_auc_score(Y_train, logreg.predict(X_train_3))
fpr, tpr, thresholds = roc_curve(Y_train, train_scores)
plt.figure(figsize=(20, 20))
plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.savefig('Log_ROC')
plt.show()
# ROC curve and AUC on the TEST set with the final feature subset.
final_features = ['A1', 'A3', 'A5', 'A7', 'A8', 'A9', 'A14', 'A16',
                  'A17', 'A20']
X_test_3 = loan_data_test_reduced[final_features]
y_pred_test = logreg.predict(X_test_3)
print(f'Accuracy of logistic regression classifier on test set: {logreg.score(X_test_3, Y_test):.2f}')
logit_roc_auc = roc_auc_score(Y_test, logreg.predict(X_test_3))
fpr, tpr, thresholds = roc_curve(Y_test, logreg.predict_proba(X_test_3)[:, 1])
plt.figure(figsize=(20, 20))
plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.savefig('Log_ROC')
plt.show()
# Trying logistic regression on SCALED data.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# IMPROVED: scaler.fit() returns the scaler itself, so assigning its
# return value to a separate variable was misleading dead code;
# fit_transform is the idiomatic one-step form.
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
logit_model_scaled = sm.Logit(Y_train, X_train_scaled)
result = logit_model_scaled.fit()
print(result.summary2())
# Same procedure for the final (significant-only) set of variables.
X_train_3_scaled = scaler.fit_transform(X_train_3)
X_test_3_scaled = scaler.transform(X_test_3)
# Logistic regression on the scaled data.
logreg_scaled = LogisticRegression()
logreg_scaled.fit(X_train_3_scaled, Y_train)
# statsmodels logit on the scaled data for the coefficient summary.
logit_model_scaled_3 = sm.Logit(Y_train, X_train_3_scaled)
result = logit_model_scaled_3.fit()
print(result.summary2())
# ROC curves for the model trained on scaled data:
# first on the TRAIN set, then on the TEST set.
for X_part, y_part in ((X_train_3_scaled, Y_train), (X_test_3_scaled, Y_test)):
    logit_roc_auc = roc_auc_score(y_part, logreg_scaled.predict(X_part))
    fpr, tpr, thresholds = roc_curve(y_part, logreg_scaled.predict_proba(X_part)[:, 1])
    plt.figure(figsize=(20, 20))
    plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc='lower right')
    plt.savefig('Log_ROC')
    plt.show()
# Scaling the data did not help to improve the model.
# The training set is unbalanced, so build a balanced (undersampled, "US")
# training set with an equal proportion of both classes.
# Positive (default == 1) rows of the training set.
train_set_positives = loan_data_train.loc[loan_data_train['default'] == 1]
train_set_positives.tail()
# Negative (default == 0) rows of the training set.
train_set_negatives = loan_data_train.loc[loan_data_train['default'] == 0]
train_set_negatives.tail()
# Sample 1500 negatives to match the number of positives.
train_set_negatives_1500 = train_set_negatives.sample(n=1500, random_state=3)
# Balanced training set.
train_set_US = pd.concat([train_set_negatives_1500, train_set_positives], ignore_index=True)
train_set_US.describe()
train_set_US.tail()
# Heatmap / correlation table for the undersampled set.
plt.subplots(figsize=(70, 70))
corr2 = train_set_US.corr()
sb.heatmap(corr2, annot=True)
# Find highly correlated features to drop.
cor_matrix2 = train_set_US.corr().abs()
# BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool.
upper_tri2 = cor_matrix2.where(np.triu(np.ones(cor_matrix2.shape), k=1).astype(bool))
to_drop_US = [column for column in upper_tri2.columns if any(upper_tri2[column] > 0.75)]
to_drop_US
# Reduced undersampled training set.
train_set_US_reduced = train_set_US.drop(columns=to_drop_US)
# Train inputs for the undersampled data.
X_train_US = train_set_US_reduced.loc[:, train_set_US_reduced.columns != 'default']
Y_train_US = train_set_US_reduced['default']
# Logistic regression on the undersampled (US) data.
logit_model_US = sm.Logit(Y_train_US, X_train_US)
result = logit_model_US.fit()
print(result.summary2())
# Keep only the variables that remained significant (low p-values).
significant_US = ['A1', 'A3', 'A5', 'A7', 'A8', 'A15', 'A16']
X_train_US_2 = X_train_US[significant_US]
logit_model_US_2 = sm.Logit(Y_train_US, X_train_US_2)
result = logit_model_US_2.fit()
print(result.summary2())
# All variables are significant — fit the sklearn model.
logreg_US = LogisticRegression()
logreg_US.fit(X_train_US_2, Y_train_US)
# ROC curve and AUC on the train set for the US model.
logit_roc_auc = roc_auc_score(Y_train_US, logreg_US.predict(X_train_US_2))
fpr, tpr, thresholds = roc_curve(Y_train_US, logreg_US.predict_proba(X_train_US_2)[:, 1])
plt.figure(figsize=(20, 20))
plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.savefig('Log_ROC')
plt.show()
# The balanced-data logistic regression improved and uses fewer variables.
us_features = ['A1', 'A3', 'A5', 'A7', 'A8', 'A15', 'A16']
X_test_US = loan_data_test[us_features]
# ROC curve and AUC on the test set for the US model.
logit_roc_auc = roc_auc_score(Y_test, logreg_US.predict(X_test_US))
fpr, tpr, thresholds = roc_curve(Y_test, logreg_US.predict_proba(X_test_US)[:, 1])
plt.figure(figsize=(20, 20))
plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc='lower right')
plt.savefig('Log_ROC')
plt.show()
# Predicted probabilities of being classified as 1, saved to CSV.
LR_prob1_array = logreg_US.predict_proba(X_test_US)[:, 1]
df_res1 = pd.DataFrame(LR_prob1_array)
df_res1.to_csv('results1.csv')
# Random forest classifier with cross-validated AUC.
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier
model_RFC = RandomForestClassifier()
# Repeated stratified CV keeps the class proportions in every fold.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
n_scores = cross_val_score(model_RFC, X_train, Y_train, scoring='roc_auc', cv=cv, n_jobs=-1, error_score='raise')
# Mean AUC of the model on the train set (no undersampling).
np.mean(n_scores)
# Fit on the full training set and predict on the test set.
model_RFC.fit(X_train, Y_train)
model_RFC.predict(X_test)
# AUC on the test set for the RFC.
sklearn.metrics.roc_auc_score(Y_test, model_RFC.predict(X_test))
# Probability predictions of being classified as 1.
RFC_array = model_RFC.predict_proba(X_test)
RFC_array_p1 = RFC_array[:, 1]
# BUG FIX: df_res2 was created as an EMPTY DataFrame, so results2.csv was
# written without any predictions; wrap the probability array instead.
df_res2 = pd.DataFrame(RFC_array_p1)
df_res2.to_csv('results2.csv')
# Checking the resulting array.
RFC_array_p1
# Logistic regression trained on the balanced dataset performs better
# (AUC 0.74 on train and test) than the LR trained on the whole
# dataset (AUC 0.61).
# NOTE(review): the "perfect" RFC AUC on the test set is expected here —
# the test CSV is the same file as the training CSV, so the forest has
# memorized the rows; re-evaluate once a genuine test set is loaded.
# Bonus: logistic regression with OVERSAMPLING, for experimental purposes
# only — normally logistic regression data should not come from repeated
# measurements.
train_set_positives.tail()
# Checking how to call specific rows.
# BUG FIX: train_set_positives keeps its original row labels from
# loan_data_train, so positional access must use iloc — loc with small
# integers would look up LABELS and fail/mismatch for labels that are
# not in the positive subset.
train_set_positives.iloc[[2, 3, 2]]
# Generating positional indexes (0..1499) for oversampling.
a = range(1500)
np.random.seed(0)
OS_indexes = np.random.choice(a, size=10000)
# Creating a 10000-row positives set by sampling positions with
# replacement.
# BUG FIX: use iloc — OS_indexes are positions, not row labels.
train_set_positives_OS = train_set_positives.iloc[OS_indexes]
train_set_positives_OS.tail()
# Balanced (oversampled) training set.
train_set_OS = pd.concat([train_set_negatives, train_set_positives_OS], ignore_index=True)
train_set_OS.describe()
# Heatmap / correlation table for the oversampled set and drop of the
# highly correlated variables — same procedure as before.
plt.subplots(figsize=(70, 70))
corr3 = train_set_OS.corr()
sb.heatmap(corr3, annot=True)
cor_matrix3 = train_set_OS.corr().abs()
# BUG FIX: np.bool was removed in NumPy 1.24 — use the builtin bool.
upper_tri3 = cor_matrix3.where(np.triu(np.ones(cor_matrix3.shape), k=1).astype(bool))
to_drop_OS = [column for column in upper_tri3.columns if any(upper_tri3[column] > 0.75)]
to_drop_OS
# Reduced train set for oversampling.
# BUG FIX: this was built from train_set_US (the undersampled data);
# it must come from train_set_OS.
train_set_OS_reduced = train_set_OS.drop(columns=to_drop_OS)
# Train inputs for the OS data.
X_train_OS = train_set_OS_reduced.loc[:, train_set_OS_reduced.columns != 'default']
Y_train_OS = train_set_OS_reduced['default']
# Logit model on the OS data.
logit_model_OS = sm.Logit(Y_train_OS, X_train_OS)
result = logit_model_OS.fit()
print(result.summary2())
# Dropping variables based on p-values.
# BUG FIX: the subset was taken from X_train_US; use X_train_OS.
X_train_OS_2 = X_train_OS[['A1', 'A3', 'A5', 'A7', 'A8', 'A15', 'A16']]
logit_model_OS_2 = sm.Logit(Y_train_OS, X_train_OS_2)
result = logit_model_OS_2.fit()
print(result.summary2())
# All variables are significant — fit logistic regression on the OS data.
logreg_OS = LogisticRegression()
logreg_OS.fit(X_train_OS_2, Y_train_OS)
# ROC curves for the OS model: train set first, then the test set.
X_test_OS = loan_data_test[['A1', 'A3', 'A5', 'A7', 'A8', 'A15', 'A16']]
for X_part, y_part in ((X_train_OS_2, Y_train_OS), (X_test_OS, Y_test)):
    logit_roc_auc = roc_auc_score(y_part, logreg_OS.predict(X_part))
    fpr, tpr, thresholds = roc_curve(y_part, logreg_OS.predict_proba(X_part)[:, 1])
    plt.figure(figsize=(20, 20))
    plt.plot(fpr, tpr, label=f'Logistic Regression (area = {logit_roc_auc:0.2f})')
    plt.plot([0, 1], [0, 1], 'r--')  # chance diagonal
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc='lower right')
    plt.savefig('Log_ROC')
    plt.show()